# Import obitab
#--------------
tmp <-
read.table(
"~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez2018_s4paircontigs_s5tagmatch_s6highQ_2libs_s7uniq_s8nosingle_s9clust.tab",
h = TRUE,
sep = "\t"
)
# create reads table
#--------------
reads <- t(tmp[,grep("^sample\\.", colnames(tmp))])
colnames(reads) <- tmp$id
rownames(reads) <- gsub("^sample\\.", "", rownames(reads))
# create motus table
#--------------
motus <- tmp[,grep("^sample\\.", colnames(tmp), invert = TRUE)[-1]]
rownames(motus) <- tmp$id
# Import sample information
#--------------
tmp1 <-
read.table(
"~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/Lez2015_2018_IDseq_editLDG.txt",
h = TRUE,
sep = "\t",
quote="", fill=FALSE
)
# create pcrs table
#--------------
pcrs <- data.frame(plate_no = tmp1$plate_number,
plate_col = tmp1$COLUMN,
plate_row = tmp1$ROW,
tag_fwd = sapply(strsplit(tmp1$tags, ":"), "[[", 1),
tag_rev = sapply(strsplit(tmp1$tags, ":"), "[[", 2),
primer_fwd = "GGATTAGATACCCTGGTAGT",
primer_rev = "CACGACACGAGCTGACG",
sample_id = tmp1$ID.seq,
id = tmp1$ID,
row.names = tmp1$ID.seq)
# specify pcr/control types in pcrs table
pcrs$type <- "sample"
pcrs$control_type <- NA
#field controls
pcrs$type[grep("ouvert", tmp1$ID)] <- "control"
pcrs$control_type[grep("ouvert", tmp1$ID)] <- "positive" # trick to include these particular controls since values for metabaR controls are fixed (extraction/pcr/sequencing/positive/NA). Added issue in metabaR GitHub for more flex for future version of the package.
#extraction controls
# only one "controle_extraction"... I will consider the field controls "ferme" as extraction controls too. NAs seem to be extraction controls too.
pcrs$type[grep("extraction|ferme", tmp1$ID)] <- "control"
pcrs$control_type[grep("extraction|ferme", tmp1$ID)] <- "extraction"
pcrs$type[is.na(tmp1$ID)] <- "control"
pcrs$control_type[is.na(tmp1$ID)] <- "extraction"
#pcrs controls (but may be extraction controls...)
pcrs$type[grep("BLANK", tmp1$ID)] <- "control"
pcrs$control_type[grep("BLANK", tmp1$ID)] <- "pcr"
#sequencing controls
# NUs
pcrs$type[grep("NU", tmp1$ID)] <- "control"
pcrs$control_type[grep("NU", tmp1$ID)] <- "sequencing"
#further info relevant to exp problems
pcrs$year <- tmp1$Annee
pcrs$boite_genet <- tmp1$Boite_Genet
pcrs$year <- tmp1$Annee
pcrs$control_closed <- tmp1$C_ferme
pcrs$control_open <- tmp1$C_ouvert
pcrs$clo_quality <- tmp1$Qualite_Clo
# create sample table
#--------------
# todo amend later, include only some infos from tmp1
id <- which(pcrs$type=="sample")
samples <- data.frame(id = tmp1$ID[id],
elev = tmp1$Elev[id],
age = tmp1$Age[id],
sex = tmp1$Sexe[id],
pop = tmp1$Pop[id],
notes = tmp1$Remarques[id],
year = tmp1$Annee[id],
sampling_date = tmp1$Date.Prelevement[id],
nprel = tmp1$Nprel[id],
qual_clo = tmp1$Qualite_Clo[id],
feces = tmp1$Feces[id],
divprel = tmp1$Divers.Prelevements[id],
row.names = tmp1$ID.seq[id])
# create the metabarlist object
#--------------
mblist <- metabarlist_generator(reads = reads,
motus = motus,
pcrs = pcrs,
samples = samples)## Warning in check_metabarlist(out): Some PCRs in out have a number of reads of
## zero in table `reads`!
# add taxonomic annotation
mblist <-
silva_annotator(metabarlist = mblist,
silva.path = "~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez_2015_2018---ssu---otus.csv",
clust.path = "~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez_2015_2018---ssu---sequence_cluster_map---lez2018_s4paircontigs_s5tagmatch_s6highQ_2libs_s7uniq_s8nosingle_s9clust_s10keepone.clstr")## Warning in check_metabarlist(metabarlist): Some PCRs in metabarlist have a
## number of reads of zero in table `reads`!
pcr type## [1] "GC_content" "ID" "ali_length"
## [4] "class" "cluster" "cluster_center"
## [7] "cluster_score" "cluster_weight" "count"
## [10] "direction" "distance" "experiment"
## [13] "forward_match" "forward_primer" "forward_score"
## [16] "forward_tag" "mode" "primers"
## [19] "reverse_match" "reverse_primer" "reverse_score"
## [22] "reverse_tag" "sampling_date" "seq_a_deletion"
## [25] "seq_a_insertion" "seq_a_mismatch" "seq_a_single"
## [28] "seq_ab_match" "seq_b_deletion" "seq_b_insertion"
## [31] "seq_b_mismatch" "seq_b_single" "seq_length"
## [34] "seq_length_ori" "status" "sequence"
## [37] "similarity" "superkingdom_silva" "kingdom_silva"
## [40] "phylum_silva" "class_silva" "order_silva"
## [43] "family_silva" "genus_silva" "lineage_silva"
## [1] "plate_no" "plate_col" "plate_row" "tag_fwd"
## [5] "tag_rev" "primer_fwd" "primer_rev" "sample_id"
## [9] "id" "type" "control_type" "year"
## [13] "control_closed" "control_open" "clo_quality"
## [1] "id" "elev" "age" "sex"
## [5] "pop" "notes" "year" "sampling_date"
## [9] "nprel" "qual_clo" "feces" "divprel"
## $dataset_dimension
## n_row n_col
## reads 1440 8070
## motus 8070 45
## pcrs 1440 15
## samples 986 12
##
## $dataset_statistics
## nb_reads nb_motus avg_reads sd_reads avg_motus sd_motus
## pcrs 11183625 8070 7766.406 13850.99 126.3639 165.1573
## samples 11124692 8047 11282.649 15520.82 175.5984 178.1532
Distribution of # reads and motus in each pcr, expecting no (or few) MOTUs/reads in sequencing, PCR and extraction controls. NA labels in the plots shown downstream will correspond to samples and positive controls to field open controls.
# store # reads and motus / pcr
mblist$pcrs$nb_reads <- rowSums(mblist$reads)
mblist$pcrs$nb_motus <- rowSums(mblist$reads>0)
# plot
check1 <- melt(mblist$pcrs[,c("control_type", "nb_reads", "nb_motus")])
ggplot(data = check1, aes(x = control_type, y = value, color = control_type)) +
geom_boxplot() +
theme_bw() +
scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
facet_wrap(~variable, scales = "free_y") +
theme(axis.text.x = element_text(angle=45, h=1))Few reads, but non negligible amounts of MOTUs in negative controls.
ggpcrtag(mblist, legend_title = "# of reads per PCR",
FUN = function(m) {
rowSums(m$reads)
},
taglist = as.character(unique(mblist$pcrs$tag_rev)))Sequencing control distribution strange (i.e. does not allow testing all tags).
Many samples with low sequencing depth. Let’s hope their signal is not similar to sequencing blanks. Potential problem with all samples with reverse tag “gcgtcagc”. This is the same tag that we spotted in the Poland data (generated in a different place)… => can be a pb of the tag itself in the reverse position?
NB: Appearance of an artifactual NA line at the top, not to be considered (ggpcrtag function pb? cannot identify it for now. Distribution of samples congruent with what indicated in “NGSfilters_lezards.xlsx”).
==> Flag samples with potentially corrupted reverse tag/
Rarefaction curves computed with the Hill numbers for \(q={0,1,2}\). They are equivalent to richness (\(q=0\)), the exponential of the Shannon index (\(q=1\)) and the inverse of the Simpson index (\(q=2\)). They give more or less weight to rare species that may correspond to molecular artifacts. The Good’s coverage index (i.e. proportion of motus that are not singletons) is also provide.
raref <- hill_rarefaction(mblist, nboot = 20, nsteps = 10)
saveRDS(raref, "~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez2015_2018_rarcruves.rds")Vizualisation according to control types
# create vector with control type info for each pcr designated by their pcr names
control_type <- mblist$pcrs$control_type
names(control_type) <- rownames(mblist$pcrs)
raref = readRDS("~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez2015_2018_rarcruves.rds")
gghill_rarefaction(raref, group = control_type) +
scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
scale_fill_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
labs(color="control_type")Ok, most controls not visible, saturation at \(^1D\) in all cases as expected.
Check whether the number of MOTUs and reads are correlated. We expect them to be especially correlated if many rare taxa are present, knowung that they are most likely errors. Should be visible in sequencing controls or samples with super high sequencing depth (higher likelihood of sequencing errors of abundant motus).
#also compute diversity at q=1
mblist$pcrs$q1 <- exp(diversity(mblist$reads, "shannon"))
# plot
tmp <- melt(mblist$pcrs[c("nb_reads", "nb_motus", "q1", "control_type")],
id.var = c("nb_reads", "control_type"))
tmp$variable <- gsub("nb_motus", "q0", tmp$variable)
ggplot(tmp, aes(x=nb_reads, y=value, color = control_type)) +
geom_point(alpha=0.2) + theme_bw() +
scale_y_log10() + scale_x_log10() +
scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
facet_wrap(~variable) +
labs(y="diversity") +
theme(legend.position = "bottom")## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
Looks good. Samples with low # reads do not behave like sequencing controls.
Identify sequences more abundant (in frequency over the whole dataset) in negative controls.
# field
mblist <- contaslayer(mblist,
controls =
rownames(mblist$pcrs)[which(mblist$pcrs$control_type ==
"positive")],
output_col = "not_a_fieldopen_contaminant")
# extraction
mblist <- contaslayer(mblist,
controls =
rownames(mblist$pcrs)[which(mblist$pcrs$control_type ==
"extraction")],
output_col = "not_an_extraction_contaminant")
# pcr
mblist <- contaslayer(mblist,
controls =
rownames(mblist$pcrs)[which(mblist$pcrs$control_type ==
"pcr")],
output_col = "not_a_pcr_contaminant")
# sequencing
mblist <- contaslayer(mblist,
controls =
rownames(mblist$pcrs)[which(mblist$pcrs$control_type ==
"sequencing")],
output_col = "not_a_sequencing_contaminant")
# check if they overlap
table(rowSums(!mblist$motus[,grep("^not_a", colnames(mblist$motus))]))##
## 0 1
## 7987 83
# do not overlap ...
# flag contaminant sequences
mblist$motus$bias <- NA
mblist$motus$bias[which(!mblist$motus$not_a_fieldopen_contaminant)] <- "conta fieldopen"
mblist$motus$bias[which(!mblist$motus$not_an_extraction_contaminant)] <- "conta extraction"
mblist$motus$bias[which(!mblist$motus$not_a_pcr_contaminant)] <- "conta pcr"
mblist$motus$bias[which(!mblist$motus$not_a_sequencing_contaminant)] <- "conta sequencing" #0
library(kableExtra)
dt <- mblist$motus[!is.na(mblist$motus$bias), c("count", "similarity", "lineage_silva", "bias")]
kable(dt[order(dt[,1], decreasing=T),]) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
font_size = 8)| count | similarity | lineage_silva | bias | |
|---|---|---|---|---|
| M02944:302:000000000-C5NTF:1:1102:6533:3428_CONS_SUB_SUB | 2044 | 100.00 | Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Pantoea; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:11787:2373_CONS_SUB_SUB | 1109 | 100.00 | Bacteria;Actinobacteria;Actinobacteria;Propionibacteriales;Propionibacteriaceae;Cutibacterium; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:18068:2552_CONS_SUB_SUB | 1048 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Caulobacterales;Caulobacteraceae;uncultured; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:20248:4898_CONS_SUB_SUB_CMP | 611 | 100.00 | Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Intrasporangiaceae;Janibacter; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:18194:23454_CONS_SUB_SUB_CMP | 311 | 100.00 | Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Micrococcaceae;Micrococcus; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2112:6450:18874_CONS_SUB_SUB | 306 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Rhodobacterales;Rhodobacteraceae;Paracoccus; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:18259:6772_CONS_SUB_SUB_CMP | 276 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Rhodobacterales;Rhodobacteraceae;Paracoccus; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:26296:14637_CONS_SUB_SUB | 94 | 99.81 | Bacteria;Proteobacteria;Gammaproteobacteria;Xanthomonadales;Xanthomonadaceae;Pseudoxanthomonas; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:14820:15754_CONS_SUB_SUB | 69 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Rhodobacterales;Rhodobacteraceae;Paracoccus; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:4586:12680_CONS_SUB_SUB_CMP | 60 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Caulobacterales;Caulobacteraceae;Brevundimonas; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1101:13951:5153_CONS_SUB_SUB_CMP | 52 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Novosphingobium; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2111:7786:7084_CONS_SUB_SUB | 46 | 99.23 | Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingomonas; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2114:20654:17967_CONS_SUB_SUB | 35 | 100.00 | Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Haemophilus; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2115:5248:5005_CONS_SUB_SUB_CMP | 32 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingomonas; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2110:21140:20335_CONS_SUB_SUB | 28 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Rhodobacterales;Rhodobacteraceae;Paracoccus; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2112:14487:4063_CONS_SUB_SUB_CMP | 25 | 100.00 | Bacteria;Bacteroidetes;Bacteroidia;Sphingobacteriales;Sphingobacteriaceae;Pedobacter; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:14741:6417_CONS_SUB_SUB_CMP | 15 | 95.27 | Bacteria;Proteobacteria;Alphaproteobacteria;Thalassobaculales;uncultured; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2106:4760:13375_CONS_SUB_SUB_CMP | 14 | 100.00 | Bacteria;Proteobacteria;Gammaproteobacteria;Oceanospirillales;Halomonadaceae;Halomonas; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:24835:7732_CONS_SUB_SUB | 14 | 100.00 | Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Microbacteriaceae;Galbitalea; | conta extraction |
| M02944:302:000000000-C5NTF:1:1101:15462:20664_CONS_SUB_SUB | 13 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Rhodobacterales;Rhodobacteraceae;Paracoccus; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1101:10358:4588_CONS_SUB_SUB_CMP | 13 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Ellin6055; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1111:12930:3349_CONS_SUB_SUB | 11 | 100.00 | Bacteria;Firmicutes;Negativicutes;Selenomonadales;Veillonellaceae;Veillonella; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1112:16267:16761_CONS_SUB_SUB | 11 | 97.07 | Bacteria;Actinobacteria;Coriobacteriia;Coriobacteriales;Coriobacteriaceae;Collinsella; | conta extraction |
| M02944:302:000000000-C5NTF:1:2113:12392:2891_CONS_SUB_SUB_CMP | 10 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Altererythrobacter; | conta extraction |
| M02944:302:000000000-C5NTF:1:2113:11358:18807_CONS_SUB_SUB_CMP | 9 | 100.00 | Bacteria;Actinobacteria;Actinobacteria;Corynebacteriales;Corynebacteriaceae;Corynebacterium 1; | conta extraction |
| M02944:302:000000000-C5NTF:1:2109:29037:13131_CONS_SUB_SUB_CMP | 9 | 100.00 | Bacteria;Firmicutes;Bacilli;Bacillales;Family XI;Gemella; | conta extraction |
| M02944:302:000000000-C5NTF:1:1116:16243:8855_CONS_SUB_SUB | 8 | 99.81 | Bacteria;Actinobacteria;Actinobacteria;Kineosporiales;Kineosporiaceae;Kineosporia; | conta extraction |
| M02944:302:000000000-C5NTF:1:2105:9191:9805_CONS_SUB_SUB_CMP | 8 | 86.20 | Bacteria;Epsilonbacteraeota;Campylobacteria;Campylobacterales;Helicobacteraceae;Helicobacter; | conta extraction |
| M02944:302:000000000-C5NTF:1:2114:6512:11004_CONS_SUB_SUB | 7 | 99.81 | Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2104:15750:6432_CONS_SUB_SUB_CMP | 7 | 97.49 | Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Mobilitalea; | conta extraction |
| M02944:302:000000000-C5NTF:1:2108:9981:18629_CONS_SUB_SUB_CMP | 7 | 87.50 | Bacteria;Epsilonbacteraeota;Campylobacteria;Campylobacterales;Helicobacteraceae;Helicobacter; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2108:15503:15034_CONS_SUB_SUB_CMP | 7 | 100.00 | Bacteria;Proteobacteria;Gammaproteobacteria;Betaproteobacteriales;Burkholderiaceae;uncultured; | conta extraction |
| M02944:302:000000000-C5NTF:1:2115:18620:24734_CONS_SUB_SUB_CMP | 7 | 99.61 | Bacteria;Bacteroidetes;Bacteroidia;Cytophagales;Spirosomaceae;Spirosoma; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1101:23049:19885_CONS_SUB_SUB | 7 | 96.73 | Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Lachnospiraceae UCG-008; | conta extraction |
| M02944:302:000000000-C5NTF:1:2113:13297:6472_CONS_SUB_SUB_CMP | 7 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Beijerinckiaceae;Microvirga; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1101:24408:13225_CONS_SUB_SUB_CMP | 7 | 99.03 | Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Robinsoniella; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1101:19336:16054_CONS_SUB_SUB | 6 | 100.00 | Bacteria;Proteobacteria;Gammaproteobacteria;Betaproteobacteriales;Methylophilaceae;uncultured; | conta extraction |
| M02944:302:000000000-C5NTF:1:2115:23975:12882_CONS_SUB_SUB | 6 | 99.81 | Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Micrococcaceae;Kocuria; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2107:25916:8830_CONS_SUB_SUB | 5 | 100.00 | Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Actinobacillus; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:6720:12462_CONS_SUB_SUB | 5 | 98.41 | Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides; | conta extraction |
| M02944:302:000000000-C5NTF:1:2105:17441:8760_CONS_SUB_SUB | 5 | 96.51 | Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Lachnospiraceae NK3A20 group; | conta extraction |
| M02944:302:000000000-C5NTF:1:2112:17378:4484_CONS_SUB_SUB_CMP | 5 | 99.81 | Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingopyxis; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1112:14910:12745_CONS_SUB_SUB_CMP | 5 | 96.00 | Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Rikenellaceae;Rikenella; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2114:27440:8582_CONS_SUB_SUB_CMP | 4 | 100.00 | Bacteria;Proteobacteria;Gammaproteobacteria;Betaproteobacteriales;Burkholderiaceae;Rhizobacter; | conta extraction |
| M02944:302:000000000-C5NTF:1:2111:12145:14422_CONS_SUB_SUB_CMP | 4 | 100.00 | Bacteria;Actinobacteria;Actinobacteria;Corynebacteriales;Corynebacteriaceae;Turicella; | conta extraction |
| M02944:302:000000000-C5NTF:1:1108:6125:12384_CONS_SUB_SUB | 4 | 100.00 | Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Dysgonomonadaceae;Dysgonomonas; | conta extraction |
| M02944:302:000000000-C5NTF:1:2115:2578:14900_CONS_SUB_SUB_CMP | 4 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Caulobacterales;Caulobacteraceae;Asticcacaulis; | conta extraction |
| M02944:302:000000000-C5NTF:1:1108:11664:10574_CONS_SUB_SUB | 4 | 99.81 | Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Brevibacteriaceae;Brevibacterium; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1111:4503:9208_CONS_SUB_SUB_CMP | 4 | 96.34 | Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Ruminococcus 1; | conta extraction |
| M02944:302:000000000-C5NTF:1:1105:21258:17018_CONS_SUB_SUB_CMP | 4 | 96.88 | Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Faecalibacterium; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2110:21651:19187_CONS_SUB_SUB_CMP | 4 | 95.00 | Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Faecalibacterium; | conta extraction |
| M02944:302:000000000-C5NTF:1:2104:25502:9867_CONS_SUB_SUB_CMP | 4 | 89.16 | Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Ruminococcaceae UCG-014; | conta extraction |
| M02944:302:000000000-C5NTF:1:2113:21235:9318_CONS_SUB_SUB | 3 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Caulobacterales;Caulobacteraceae;Caulobacter; | conta extraction |
| M02944:302:000000000-C5NTF:1:1115:20591:7956_CONS_SUB_SUB_CMP | 3 | 95.91 | Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Rhizobiaceae;Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium; | conta extraction |
| M02944:302:000000000-C5NTF:1:1106:24209:10282_CONS_SUB_SUB | 3 | 87.75 | Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides; | conta extraction |
| M02944:302:000000000-C5NTF:1:1112:10846:10094_CONS_SUB_SUB | 3 | 86.19 | Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Ruminiclostridium 6; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2116:14852:8020_CONS_SUB_SUB_CMP | 3 | 94.10 | Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;ASF356; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1106:28966:14237_CONS_SUB_SUB | 3 | 88.26 | Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides; | conta pcr |
| M02944:302:000000000-C5NTF:1:1110:8393:19940_CONS_SUB_SUB_CMP | 2 | 96.51 | Bacteria;Firmicutes;Clostridia;Clostridiales;Family XIII;Family XIII UCG-001; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2114:8675:6203_CONS_SUB_SUB | 2 | 94.19 | Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Lachnospiraceae NK3A20 group; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2107:25297:23116_CONS_SUB_SUB | 2 | 99.81 | Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus; | conta pcr |
| M02944:302:000000000-C5NTF:1:2112:25256:10745_CONS_SUB_SUB | 2 | 99.81 | Bacteria;Proteobacteria;Deltaproteobacteria;Oligoflexales;Oligoflexaceae;Oligoflexus; | conta extraction |
| M02944:302:000000000-C5NTF:1:1106:26438:16644_CONS_SUB_SUB_CMP | 2 | 97.10 | Bacteria;Firmicutes;Clostridia;Clostridiales;Family XIII;uncultured; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1111:19295:22326_CONS_SUB_SUB_CMP | 2 | 99.81 | Bacteria;Actinobacteria;Acidimicrobiia;Microtrichales;Ilumatobacteraceae;CL500-29 marine group; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1102:14654:8734_CONS_SUB_SUB | 2 | 100.00 | Bacteria;Deinococcus-Thermus;Deinococci;Deinococcales;Deinococcaceae;Deinococcus; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1115:13182:19432_CONS_SUB_SUB_CMP | 2 | 95.37 | Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Paludibacteraceae;Paludibacter; | conta extraction |
| M02944:302:000000000-C5NTF:1:2102:4700:11012_CONS_SUB_SUB | 2 | 96.90 | Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2109:19958:20100_CONS_SUB_SUB | 2 | 100.00 | Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales;Marinobacteraceae;Marinobacter; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:1105:10700:11474_CONS_SUB_SUB | 2 | 100.00 | Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Porphyromonadaceae;Porphyromonas; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2103:8240:21223_CONS_SUB_SUB_CMP | 2 | 88.00 | Bacteria;Epsilonbacteraeota;Campylobacteria;Campylobacterales;Helicobacteraceae;Helicobacter; | conta extraction |
| M02944:302:000000000-C5NTF:1:2113:17367:22840_CONS_SUB_SUB | 2 | 99.62 | Bacteria;BRC1; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2115:24129:14093_CONS_SUB_SUB_CMP | 2 | 96.14 | Bacteria;Firmicutes;Bacilli;Lactobacillales;P5D1-392; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2110:21441:3238_CONS_SUB_SUB_CMP | 2 | 96.31 | Bacteria;Proteobacteria;Deltaproteobacteria;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2108:18727:17807_CONS_SUB_SUB_CMP | 2 | 100.00 | Bacteria;Actinobacteria;Coriobacteriia;OPB41; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2109:10408:10892_CONS_SUB_SUB | 2 | 98.65 | Bacteria;Proteobacteria;Gammaproteobacteria;Betaproteobacteriales;Burkholderiaceae;Limnobacter; | conta extraction |
| M02944:302:000000000-C5NTF:1:1113:18203:8055_CONS_SUB_SUB | 2 | 97.62 | Bacteria;Bacteroidetes;Bacteroidia;Flavobacteriales;Weeksellaceae;Chryseobacterium; | conta extraction |
| M02944:302:000000000-C5NTF:1:1101:17541:24684_CONS_SUB_SUB | 2 | 98.25 | Bacteria;Chlamydiae;Chlamydiae;Chlamydiales;Parachlamydiaceae;uncultured; | conta extraction |
| M02944:302:000000000-C5NTF:1:1101:7017:6084_CONS_SUB_SUB | 2 | 96.40 | Bacteria;Epsilonbacteraeota;Campylobacteria;Campylobacterales;Helicobacteraceae;Helicobacter; | conta extraction |
| M02944:302:000000000-C5NTF:1:2106:12685:12487_CONS_SUB_SUB_CMP | 2 | 99.80 | Bacteria;Firmicutes;Clostridia;Clostridiales;Family XI;Peptoniphilus; | conta extraction |
| M02944:302:000000000-C5NTF:1:1110:29102:15287_CONS_SUB_SUB_CMP | 2 | 99.62 | Bacteria;Proteobacteria;Gammaproteobacteria;Xanthomonadales;Xanthomonadaceae;Xanthomonas; | conta extraction |
| M02944:302:000000000-C5NTF:1:2104:16591:18431_CONS_SUB_SUB_CMP | 2 | 95.69 | Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides; | conta fieldopen |
| M02944:302:000000000-C5NTF:1:2106:15440:22800_CONS_SUB_SUB | 2 | 90.50 | Bacteria;Fusobacteria;Fusobacteriia;Fusobacteriales;Fusobacteriaceae;Fusobacterium; | conta extraction |
| M02944:302:000000000-C5NTF:1:1111:19632:6206_CONS_SUB_SUB_CMP | 2 | 100.00 | Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingorhabdus; | conta pcr |
Distribution of the top 3 contaminants in negative controls on the pcr plates.
for (i in 1:3){
max.conta <- rownames(dt)[order(dt$count, decreasing=T)[i]]
p <- ggpcrtag(mblist, legend_title = "# of reads of the most abundant conta",
FUN = function(m) {m$reads[,max.conta]},
taglist = as.character(unique(mblist$pcrs$tag_rev)))
print(as.vector(mblist$motus[max.conta,"lineage_silva"]))
print(p)
}## [1] "Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacteriales;Enterobacteriaceae;Pantoea;"
## [1] "Bacteria;Actinobacteria;Actinobacteria;Propionibacteriales;Propionibacteriaceae;Cutibacterium;"
## [1] "Bacteria;Proteobacteria;Alphaproteobacteria;Caulobacterales;Caulobacteraceae;uncultured;"
The first is a plant pathogen and responsible for some infection in humans, the second is associated with human skin (one sp member of that genus cause acne).
Distribution in the whole dataset
tmp <- melt(decostand(mblist$reads, "total")[,!is.na(mblist$motus$bias)])
tmp$sample_type <- mblist$pcrs$control_type[match(tmp$Var1, rownames(mblist$pcrs))]
tmp$conta_type <- gsub(" ", "_", mblist$motus$bias[match(tmp$Var2, rownames(mblist$motus))])
ggplot(tmp, aes(x=sample_type, y=value)) +
geom_boxplot(color="grey40") +
geom_jitter(aes(color=sample_type), width = 0.2, alpha=0.5) +
facet_wrap(~conta_type, ncol=4,
labeller = label_parsed) +
scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
labs(x=NULL, y="Prop. Reads (log10)") +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text.x = element_text(angle=45, hjust=1),
strip.background = element_blank(),
strip.text = element_text(face="bold", hjust=0),
legend.position = "bottom") +
scale_y_log10()Motus detected as contaminants are overall in low abundance in samples, except for field open sample.. Difficult to say if its indeed true contaminants or contaminations with biological samples at this stage…
Flag samples with too many contaminants.
tmp <-
data.frame(conta.relab = rowSums(decostand(mblist$reads, "total")[, !is.na(mblist$motus$bias)]))
tmp$sample_type <- mblist$pcrs$control_type[match(rownames(tmp), rownames(mblist$pcrs))]
p = ggplot(tmp, aes(x=sample_type, y=conta.relab)) +
geom_boxplot(color="grey40") +
geom_jitter(aes(color=sample_type), width = 0.2, alpha=0.5) +
scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
labs(x=NULL, y="Prop. Reads (log10)") +
theme_bw() +
scale_y_log10()
thresh <- 5e-02
p +
geom_hline(yintercept = thresh, lty=2, col="darkgreen", lwd=1) +
annotate("text", x=1, y=thresh, label="potential failed PCR: too many contaminants",
col="darkgreen", hjust=0, fontface="bold")mblist$pcrs$pcrbias[which(tmp$conta.relab[match(rownames(mblist$pcrs), rownames(tmp))] > thresh)] <-
ifelse(
is.na(mblist$pcrs$pcrbias[which(tmp$conta.relab[match(rownames(mblist$pcrs), rownames(tmp))] >
thresh)]), "too much conta",
paste(mblist$pcrs$pcrbias[which(tmp$conta.relab[match(rownames(mblist$pcrs), rownames(tmp))] >
thresh)], "too much conta", sep = "|")
)Exclude archaea (not targeted by the primers -> biased amplificaiton), and eukaryotes mitochondria or chloroplasts.
idx <- grep("Bacteria", mblist$motus$lineage_silva, invert = TRUE)
conta_bio <- rownames(mblist$motus)[idx]
#tag motus accordingly (append with previous bias identified)
mblist$motus$bias[match(conta_bio, rownames(mblist$motus))] <-
ifelse(is.na(mblist$motus$bias[match(conta_bio, rownames(mblist$motus))]), "untargeted clade",
paste(mblist$motus$bias[match(conta_bio, rownames(mblist$motus))], "untargeted clade",
sep="|"))
#show them in a table
dt <- mblist$motus[conta_bio, c("count", "similarity", "lineage_silva", "bias")]
kable(dt[order(dt[,1], decreasing=T),]) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
font_size = 8)| count | similarity | lineage_silva | bias | |
|---|---|---|---|---|
| M02944:302:000000000-C5NTF:1:1102:19828:10474_CONS_SUB_SUB | 79 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1102:26062:21194_CONS_SUB_SUB | 30 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2104:4756:19425_CONS_SUB_SUB | 27 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1102:24563:19138_CONS_SUB_SUB | 26 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2111:15208:24009_CONS_SUB_SUB_CMP | 21 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2112:24836:11578_CONS_SUB_SUB_CMP | 20 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2115:21232:10051_CONS_SUB_SUB | 17 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1108:7601:7020_CONS_SUB_SUB_CMP | 9 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1105:20803:23631_CONS_SUB_SUB | 6 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2115:15013:17213_CONS_SUB_SUB | 5 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2119:3851:11610_CONS_SUB_SUB_CMP | 5 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2113:27150:5316_CONS_SUB_SUB | 4 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2107:9536:16765_CONS_SUB_SUB | 4 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2112:18642:18647_CONS_SUB_SUB | 3 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2110:23135:6738_CONS_SUB_SUB_CMP | 3 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1102:22699:18659_CONS_SUB_SUB | 3 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2105:11769:6823_CONS_SUB_SUB | 3 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2112:12661:17767_CONS_SUB_SUB_CMP | 3 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2104:7016:11544_CONS_SUB_SUB | 3 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1109:6445:10780_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2106:19509:21253_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2115:25205:16362_CONS_SUB_SUB_CMP | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1103:26303:16803_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1110:3058:16325_CONS_SUB_SUB_CMP | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2103:26934:4823_CONS_SUB_SUB_CMP | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2103:26813:5575_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1111:17595:14021_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2107:10920:13279_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2106:5336:9643_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2102:11197:10513_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1116:17849:6243_CONS_SUB_SUB_CMP | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2117:11781:7738_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2106:29005:14968_CONS_SUB_SUB_CMP | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1112:4204:11818_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2111:18455:14087_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2103:26901:4810_CONS_SUB_SUB_CMP | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:1101:22318:17024_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2113:18410:12985_CONS_SUB_SUB | 2 | NA | NA | untargeted clade |
| M02944:302:000000000-C5NTF:1:2115:18257:15709_CONS_SUB_SUB_CMP | 2 | NA | NA | untargeted clade |
Either super artifactual sequences or unkown bugs; a blast on ncbi also returns no match too. In any cases, they remain rare in the dataset.
Assumption that low % similarity corresponds to degraded sequences,since the 16S is well conserved and now relatively well covered (and the fact that 75% similarity is not far from being a random alignment).
a <-
ggplot(mblist$motus, aes(x=similarity)) +
geom_histogram(color="grey", fill="white", bins=20) +
theme_bw() + scale_x_continuous(limits=c(60, 100)) +
theme(panel.grid = element_blank()) +
labs(x="% similarity against best match", y="# MOTUs")
thresh <- 80
a <-
a + geom_vline(xintercept = thresh, lty=2, color="darkgreen") +
annotate("text", x=thresh-0.02, y=250,
label="Potential\ndegraded MOTUs", col="darkgreen", hjust=1, fontface="bold")
b <-
ggplot(mblist$motus, aes(x=similarity, y = ..count.., weight = count)) +
geom_histogram(color="grey", fill="white", bins=20) +
theme_bw() + scale_x_continuous(limits=c(60, 100)) +
theme(panel.grid = element_blank()) +
labs(x="% similarity against best match", y="# Reads")
b <- b + geom_vline(xintercept = thresh, lty=2, color="darkgreen") +
annotate("text", x=thresh-0.02, y=3e6,
label="Potential highly\ndegraded MOTUs", col="darkgreen", hjust=1, fontface="bold")
ggdraw() +
draw_plot(a, x=0, y=0, width = 0.5) +
draw_plot(b, x=0.5, y=0, width = 0.5)#Flag corresponding motus
conta.deg <- rownames(mblist$motus)[which(mblist$motus$similarity<thresh)]
mblist$motus$bias[match(conta.deg, rownames(mblist$motus))] <-
ifelse(is.na(mblist$motus$bias[match(conta.deg, rownames(mblist$motus))]), "conta deg",
paste(mblist$motus$bias[match(conta.deg, rownames(mblist$motus))], "conta deg", sep="|"))Only one such a sequence -> must have been filtered before, maybe directly in the SILVAngs pipeline (?).
One expects abundant genuine MOTUs to be found in low abundance in samples were they are not supposed to be. The rate of tag-jumps is difficult to anticipate and depends on several things (mainly how the library was prepared). One way to assess this is to remove rare MOTUs at different thresholds of abudnance and see when the sequencing controls are empty.
#define thresholds
thresh <- c(0,1e-5, 1e-4,1e-3, 1e-2, 3e-2, 5e-2)
tests <- lapply(thresh, function(x) tagjumpslayer(mblist,x, method = "cut")) # can be changed to "substract" but not sure it is appropriate
names(tests) <- thresh
#format for plot
tmp <- melt(as.matrix(do.call("rbind", lapply(tests, function(x) rowSums(x$reads)))))
colnames(tmp) <- c("thresh", "sample", "abund_contaout")
tmp$rich_contaout <-
melt(as.matrix(do.call("rbind", lapply(tests, function(x) {
specnumber(x$reads[, is.na(mblist$motus$bias)])
}))))$value
tmp$controls = mblist$pcrs$control_type[match(tmp$sample, rownames(mblist$pcrs))]
tmp2 <- melt(tmp, id.vars=colnames(tmp)[-grep("contaout", colnames(tmp))])
tmp2$variable <- ifelse(tmp2$variable=="rich_contaout", "#MOTUs", "#Reads")
a <-
ggplot(tmp2, aes(x=as.factor(thresh), y=value)) +
geom_boxplot(color="grey40") +
geom_jitter(aes(color=controls), width = 0.2, alpha=0.5) +
scale_color_manual(values = c("brown", "red", "cyan4","pink"), na.value = "darkgrey") +
facet_wrap(~variable+controls, scale="free_y", ncol=5) +
theme_bw() +
labs(x="rel Abundance filtering threshold", y="#Reads/MOTUs") +
theme(panel.grid = element_blank(),
strip.background = element_blank(),
axis.text.x = element_text(angle=40, h=1),
legend.position = "none")
threshfix <- 0.01
a + geom_vline(xintercept = match(threshfix, c(0,1e-5, 1e-4,1e-3, 1e-2, 3e-2, 5e-2)), lty=2,
color="darkgreen")Let’s plot the abundance of the most abundant MOTU before and after filtering.
nkeep <- which(thresh == threshfix)
#plot in PCR plates
ggpcrtag(mblist, legend_title = "Most abundant MOTU abundance before tagjump curation",
FUN = function(m) {mblist$reads[,which.max(mblist$motus$count)]},
taglist = as.character(unique(mblist$pcrs$tag_rev)))#plot in PCR plates
ggpcrtag(mblist, legend_title = "Most abundant MOTU abundance after tagjump curation",
FUN = function(m) {tests[[nkeep]]$reads[,which.max(mblist$motus$count)]},
taglist = as.character(unique(mblist$pcrs$tag_rev)))Identify pcrs with low amounts of reads.
a <-
ggplot(mblist$pcrs, aes(nb_reads)) +
geom_histogram(bins=40, color="grey", fill="white") +
scale_x_log10() +
labs(x="# Reads (with all OTUs and PCRs)",
y="# PCRs") +
theme_bw() +
theme(panel.grid = element_blank())
thresh = 1e2 #based on what can be found in sequencing controls.
a + geom_vline(xintercept = thresh, lty=2, color="darkgreen")## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 289 rows containing non-finite values (stat_bin).
## conta deg conta extraction conta fieldopen conta pcr
## 1 33 47 3
## untargeted clade NA's
## 39 7947
#motus
paste("MOTUs considered as contaminant/spurious sequences: ",
round(sum(!is.na(mblist$motus$bias)) / length(mblist$motus$bias) * 100, 3),
" %")## [1] "MOTUs considered as contaminant/spurious sequences: 1.524 %"
#pcrs
paste("PCRs (control excluded) considered as 'dysfunctional': ",
round(sum(!is.na(mblist$pcrs$pcrbias) & (mblist$pcrs$type == "sample")) / length(mblist$pcrs$pcrbias) * 100, 3),
" %")## [1] "PCRs (control excluded) considered as 'dysfunctional': 3.125 %"
#plots
tmp <- as.data.frame(table(mblist$motus$bias, useNA = "ifany"))
tmp$Perc <- tmp$Freq/sum(tmp$Freq)
tmp$max <- cumsum(tmp$Perc)
tmp$min <- c(0, head(tmp$max, n=-1))
set.seed(9)
cola <- sample(colors(), nlevels(tmp$Var1))
a <-
ggplot(tmp, aes(ymax=max, ymin=min, xmax=4, xmin=3, fill=Var1)) +
geom_rect() + xlim(c(2, 4)) + labs(fill="Bias type") +
coord_polar(theta="y") + theme_void() +
scale_fill_manual(values = cola, na.value = "darkgrey") +
theme(legend.position = "bottom", legend.direction = "vertical")
tmp <- as.data.frame(table(mblist$pcrs$pcrbias[mblist$pcrs$type=="sample"], useNA = "ifany"))
tmp$Perc <- tmp$Freq/sum(tmp$Freq)
tmp$max <- cumsum(tmp$Perc)
tmp$min <- c(0, head(tmp$max, n=-1))
set.seed(3)
colb <- sample(colors(), nlevels(tmp$Var1))
b <-
ggplot(tmp, aes(ymax=max, ymin=min, xmax=4, xmin=3, fill=Var1)) +
geom_rect() + xlim(c(2, 4)) + labs(fill="Bias type") +
coord_polar(theta="y") + theme_void() +
scale_fill_manual(values = colb, na.value = "darkgrey") +
theme(legend.position = "bottom", legend.direction = "vertical")
ggdraw() +
draw_plot(a, x=0, y=0, width=0.5, height=1) +
draw_label("MOTUs", x=0.1, y=0.9) +
draw_plot(b, x=0.5, y=0, width=0.5, height=1) +
draw_label("PCRs", x=0.6, y=0.9)Removal of suprious MOTUs, PCRs as well as adjusting read counts by removing tagjumps. Also ensure that exclude any control.
tmp <- tests[[nkeep]]
tmp$motus$bias <- mblist$motus$bias
tmp$pcrs$pcrbias <- mblist$pcrs$pcrbias
tmp1 <- subset_metabarlist(subset_metabarlist(tmp, "motus", is.na(tmp$motus$bias)),
"pcrs", is.na(tmp$pcrs$pcrbias))
table(tmp1$pcrs$type)##
## control sample
## 102 941
mblist_clean <- subset_metabarlist(tmp1, "pcrs", tmp1$pcrs$type=="sample")
summary_metabarlist(mblist)## $dataset_dimension
## n_row n_col
## reads 1440 8070
## motus 8070 50
## pcrs 1440 19
## samples 986 12
##
## $dataset_statistics
## nb_reads nb_motus avg_reads sd_reads avg_motus sd_motus
## pcrs 11183625 8070 7766.406 13850.99 126.3639 165.1573
## samples 11124692 8047 11282.649 15520.82 175.5984 178.1532
## $dataset_dimension
## n_row n_col
## reads 1043 7944
## motus 7944 50
## pcrs 1043 19
## samples 941 12
##
## $dataset_statistics
## nb_reads nb_motus avg_reads sd_reads avg_motus sd_motus
## pcrs 11038919 7944 10583.81 15315.19 136.1198 178.1605
## samples 10999397 7943 11689.05 15728.86 148.7088 183.1015
## $dataset_dimension
## n_row n_col
## reads 941 7943
## motus 7943 50
## pcrs 941 19
## samples 941 12
##
## $dataset_statistics
## nb_reads nb_motus avg_reads sd_reads avg_motus sd_motus
## pcrs 10999397 7943 11689.05 15728.86 148.7088 183.1015
## samples 10999397 7943 11689.05 15728.86 148.7088 183.1015
Now ensure removing any empty PCR or MOTUs (caused by removal of spurious MOTU or PCR)
if(sum(colSums(mblist_clean$reads)==0)>0){print("empty motus present")}
if(sum(colSums(mblist_clean$reads)==0)>0){print("empty pcrs present")}Update some parameters in the metabarlist (e.g. counts, etc.)
mblist_clean$motus$count <- colSums(mblist_clean$reads)
mblist_clean$pcrs$obitools.reads <- mblist_clean$pcrs$nb_reads
mblist_clean$pcrs$nb_reads <- mblist_clean$pcrs$metabaR.reads <- rowSums(mblist_clean$reads)
mblist_clean$pcrs$obitools.otus <- mblist_clean$pcrs$nb_motus
mblist_clean$pcrs$nb_motus <-
mblist_clean$pcrs$metabaR.otus <-
rowSums(ifelse(mblist_clean$reads > 0, T, F))Compare before and after trimming gross stats
check.bioinfo <-
rbind(data.frame(melt(as.matrix(mblist_clean$pcrs)[,
grep("\\.reads", colnames(mblist_clean$pcrs))]),
type = "reads"),
data.frame(melt(as.matrix(mblist_clean$pcrs)[,
grep("\\.otus", colnames(mblist_clean$pcrs))]),
type = "motus"))
check.bioinfo$variable <- gsub("\\.reads|\\.otus", "", check.bioinfo$Var2)
check.bioinfo$value <- as.numeric(as.vector(check.bioinfo$value))
ggplot(data = check.bioinfo, aes(x = Var2, y = value)) +
geom_boxplot( color = "darkgrey") +
geom_jitter(alpha=0.1, color = "darkgrey") +
theme_bw() + labs(x="Analysis step") +
facet_wrap(~type, scales = "free", ncol = 5) +
theme(axis.text.x = element_text(angle=45, h=1))Compare before and after trimming alpha diversity patterns
raref2 = hill_rarefaction(mblist_clean, nboot = 20, nsteps = 10)
saveRDS(raref2, "~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez2015_2018_rarcruves_clean.rds")raref2 = readRDS("~/Dropbox/ECOFEED_microbiote_lizard/2015-2018/lez2015_2018_bioinfo/lez2015_2018_rarcruves_clean.rds")
a = gghill_rarefaction(raref)
b = gghill_rarefaction(raref2)
ggdraw() +
draw_plot(a, x=0, y=0.5, width = 1, height = 0.5) +
draw_label("Raw", x=0.05, y=0.95) +
draw_plot(b, x=0, y=0, width = 1, height = 0.5) +
draw_label("Clean", x=0.05, y=0.55)Compare before and after trimming beta diversity patterns
tmp <- subset_metabarlist(mblist, table = "pcr", indices = mblist$pcrs$nb_reads>0 & mblist$pcrs$type=="sample")
library(ecodist)
mds <- cmdscale(bcdist(tmp$reads), k = 2, eig = T, add = T) # bcdist from ecodist much faster than vegan's vegdist
d <- data.frame(mds$points)
d$controls <- tmp$pcrs[rownames(d), "control_type"]
d$year <- tmp$samples[rownames(d), "year"]
a<- ggplot(d, aes(x=X1, y=X2, color=year)) +
geom_point() + theme_bw() +
labs(x=paste("PCoA1 (", round(100*mds$eig[1]/sum(mds$eig),2), "%)", sep=""),
y=paste("PCoA2 (", round(100*mds$eig[2]/sum(mds$eig),2), "%)", sep=""),
shape="control type")
mds2 <- cmdscale(bcdist(mblist_clean$reads), k = 2, eig = T, add = T) # bcdist from ecodist much faster than vegan's vegdist
d2 <- data.frame(mds2$points)
d2$controls <- mblist_clean$pcrs[rownames(d2), "control_type"]
d2$year <- mblist_clean$samples[rownames(d2), "year"]
b <- ggplot(d2, aes(x=X1, y=X2, color=year)) +
geom_point() + theme_bw() +
labs(x=paste("PCoA1 (", round(100*mds2$eig[1]/sum(mds2$eig),2), "%)", sep=""),
y=paste("PCoA2 (", round(100*mds2$eig[2]/sum(mds2$eig),2), "%)", sep=""),
shape="control type")
ggdraw() +
draw_plot(a, x=0, y=0.5, width = 1, height = 0.5) +
draw_label("Raw", x=0.05, y=0.95) +
draw_plot(b, x=0, y=0, width = 1, height = 0.5) +
draw_label("Clean", x=0.05, y=0.55)